Loading libraries

library(GEOquery)
library(oligo)
# library(affy)
library(limma)
library(sva)
library(tidyverse)
library(factoextra)
library(pheatmap)
library(caret)
library(RColorBrewer)
# library(viridis)
library(ggsci)

Custom functions

# given a matrix, perform min-max scaling on its columns
min_max_mat <- function(mat){
  mat_rescaled <- apply(mat, 2, function(v){
    v_range <- range(v)
    names(v_range) <- c("minimum", "maximum")
    range_difference <- v_range["maximum"] - v_range["minimum"]
    rescaled <- (v - v_range["minimum"])/range_difference
    return(rescaled)
  })
  return(mat_rescaled)
}

Getting data from GEOquery

# geodata <- GEOquery::getGEO(GEO = "GSE76275", destdir = "./tempfiles")
# geodata <- GEOquery::getGEO(filename = "./tempfiles/GSE76275_series_matrix.txt.gz")
# saveRDS(geodata, "geodata.RDS")
geodata <- readRDS("geodata.RDS")
# mdata <- geodata %>% 
#   pluck(1) %>% 
#   phenoData() %>%
#   pData() %>% as_tibble()
# feature_data <- geodata %>% 
#   pluck(1) %>% 
#   featureData()
  
# write_csv(mdata, "raw_mdata.csv")
mdata <- read_csv("raw_mdata.csv")
Rows: 265 Columns: 69
── Column specification ──────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (62): title, geo_accession, status, submission_date, last_update_date, type, source_name_ch1, organism...
dbl  (6): channel_count, taxid_ch1, contact_zip/postal_code, data_row_count, age (years):ch1, body mass in...
lgl  (1): growth_protocol_ch1

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# saveRDS(feature_data, "featureData.RDS")
# feature_data <- readRDS("featureData.RDS")

Inspecting and cleaning the metadata

mdata %>% 
  glimpse()
Rows: 265
Columns: 69
$ title                                <chr> "S1-H10", "S1-H14", "S1-H19", "S1-H20B", "S1-H22", "S1-H27", "S…
$ geo_accession                        <chr> "GSM1974566", "GSM1974567", "GSM1974568", "GSM1974569", "GSM197…
$ status                               <chr> "Public on Dec 18 2015", "Public on Dec 18 2015", "Public on De…
$ submission_date                      <chr> "Dec 17 2015", "Dec 17 2015", "Dec 17 2015", "Dec 17 2015", "De…
$ last_update_date                     <chr> "Dec 18 2015", "Dec 18 2015", "Dec 18 2015", "Dec 18 2015", "De…
$ type                                 <chr> "RNA", "RNA", "RNA", "RNA", "RNA", "RNA", "RNA", "RNA", "RNA", …
$ channel_count                        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ source_name_ch1                      <chr> "Human breast cancer", "Human breast cancer", "Human breast can…
$ organism_ch1                         <chr> "Homo sapiens", "Homo sapiens", "Homo sapiens", "Homo sapiens",…
$ characteristics_ch1                  <chr> "tissue: Breast cancer", "tissue: Breast cancer", "tissue: Brea…
$ characteristics_ch1.1                <chr> "gender: Female", "age (years): 41", "age (years): 55", "age (y…
$ characteristics_ch1.2                <chr> "race: Caucasian", "gender: Female", "gender: Female", "gender:…
$ characteristics_ch1.3                <chr> "body mass index: 32", "race: Caucasian", "race: Caucasian", "r…
$ characteristics_ch1.4                <chr> "menopausal status: Post-Menopausal", "body mass index: 29", "h…
$ characteristics_ch1.5                <chr> "histology group: Infiltrating Ductal Carcinoma", "menopausal s…
$ characteristics_ch1.6                <chr> "histology: Infiltrating Ductal Carcinoma", "histology group: I…
$ characteristics_ch1.7                <chr> "ajcc stage (7th edition, 2010): T2N1M0", "histology: Infiltrat…
$ characteristics_ch1.8                <chr> "set: Validation TN", "ajcc stage (7th edition, 2010): T1N0M0",…
$ characteristics_ch1.9                <chr> "er: Negative", "set: Validation TN", "pr: Negative", "set: Val…
$ characteristics_ch1.10               <chr> "pr: Negative", "er: Negative", "her2: Negative", "er: Negative…
$ characteristics_ch1.11               <chr> "her2: Negative", "pr: Negative", "triple-negative status: TN",…
$ characteristics_ch1.12               <chr> "triple-negative status: TN", "her2: Negative", "tumor grade: P…
$ characteristics_ch1.13               <chr> "tumor size: 2 - 5 cm", "triple-negative status: TN", "tumor si…
$ characteristics_ch1.14               <chr> "positive nodes: 1 - 3", "tumor grade: Poorly Differentiated", …
$ characteristics_ch1.15               <chr> "metastases: No mets", "tumor size: <=2cm", "metastases: No met…
$ characteristics_ch1.16               <chr> "tnbc subtype: Mesenchymal (MES)", "positive nodes: 0", "tnbc s…
$ characteristics_ch1.17               <chr> NA, "metastases: No mets", NA, "metastases: No mets", NA, NA, N…
$ characteristics_ch1.18               <chr> NA, "tnbc subtype: Basal-Like Immune-Activated (BLIA)", NA, "tn…
$ treatment_protocol_ch1               <chr> "breast tumor samples, flash frozen", "breast tumor samples, fl…
$ growth_protocol_ch1                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ molecule_ch1                         <chr> "total RNA", "total RNA", "total RNA", "total RNA", "total RNA"…
$ extract_protocol_ch1                 <chr> "Trizol extraction of total RNA was performed according to the …
$ label_ch1                            <chr> "biotin", "biotin", "biotin", "biotin", "biotin", "biotin", "bi…
$ label_protocol_ch1                   <chr> "3' IVT express", "3' IVT express", "3' IVT express", "3' IVT e…
$ taxid_ch1                            <dbl> 9606, 9606, 9606, 9606, 9606, 9606, 9606, 9606, 9606, 9606, 960…
$ hyb_protocol                         <chr> "Standard Affymetix protocol for 3' IVT design", "Standard Affy…
$ scan_protocol                        <chr> "Scanner GCS3000 7G", "Scanner GCS3000 7G", "Scanner GCS3000 7G…
$ description                          <chr> "log2 gene expression data", "log2 gene expression data", "log2…
$ data_processing                      <chr> "The data were analyzed with  Affy package in R, RMA method", "…
$ data_processing.1                    <chr> "Expressions estimated using RMA signal method", "Expressions e…
$ platform_id                          <chr> "GPL570", "GPL570", "GPL570", "GPL570", "GPL570", "GPL570", "GP…
$ contact_name                         <chr> "Suzanne,,Fuqua", "Suzanne,,Fuqua", "Suzanne,,Fuqua", "Suzanne,…
$ contact_institute                    <chr> "Baylor College of Medicine", "Baylor College of Medicine", "Ba…
$ contact_address                      <chr> "1 Baylor Plaza", "1 Baylor Plaza", "1 Baylor Plaza", "1 Baylor…
$ contact_city                         <chr> "Houston", "Houston", "Houston", "Houston", "Houston", "Houston…
$ contact_state                        <chr> "Texas", "Texas", "Texas", "Texas", "Texas", "Texas", "Texas", …
$ `contact_zip/postal_code`            <dbl> 77030, 77030, 77030, 77030, 77030, 77030, 77030, 77030, 77030, …
$ contact_country                      <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", …
$ supplementary_file                   <chr> "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1974nnn/GSM1974566/s…
$ data_row_count                       <dbl> 54675, 54675, 54675, 54675, 54675, 54675, 54675, 54675, 54675, …
$ `age (years):ch1`                    <dbl> NA, 41, 55, 55, 65, 40, 66, 57, NA, 55, 44, 78, 54, 71, 37, 51,…
$ `ajcc stage (7th edition, 2010):ch1` <chr> "T2N1M0", "T1N0M0", "T2N0M0", "T1cN0M0", "T2N2MX", "T1NXMX", "T…
$ `body mass index:ch1`                <dbl> 32, 29, NA, 31, 38, 22, 22, 38, 29, 32, 44, 28, 34, 32, 23, 23,…
$ `er:ch1`                             <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Ne…
$ `gender:ch1`                         <chr> "Female", "Female", "Female", "Female", "Female", "Female", "Fe…
$ `her2:ch1`                           <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Ne…
$ `histology group:ch1`                <chr> "Infiltrating Ductal Carcinoma", "Infiltrating Ductal Carcinoma…
$ `histology:ch1`                      <chr> "Infiltrating Ductal Carcinoma", "Infiltrating Ductal Carcinoma…
$ `menopausal status:ch1`              <chr> "Post-Menopausal", "Post-Menopausal", NA, "Post-Menopausal", "P…
$ `metastases:ch1`                     <chr> "No mets", "No mets", "No mets", "No mets", NA, NA, NA, NA, "No…
$ `positive nodes:ch1`                 <chr> "1 - 3", "0", "0", "0", "4-9", NA, NA, "0", "0", "4-9", "0", "1…
$ `pr:ch1`                             <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Ne…
$ `race:ch1`                           <chr> "Caucasian", "Caucasian", "Caucasian", "Caucasian", "Caucasian"…
$ `set:ch1`                            <chr> "Validation TN", "Validation TN", "Validation TN", "Validation …
$ `tissue:ch1`                         <chr> "Breast cancer", "Breast cancer", "Breast cancer", "Breast canc…
$ `tnbc subtype:ch1`                   <chr> "Mesenchymal (MES)", "Basal-Like Immune-Activated (BLIA)", "Mes…
$ `triple-negative status:ch1`         <chr> "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN…
$ `tumor grade:ch1`                    <chr> NA, "Poorly Differentiated", "Poorly Differentiated", "Poorly D…
$ `tumor size:ch1`                     <chr> "2 - 5 cm", "<=2cm", "2 - 5 cm", "<=2cm", "2 - 5 cm", "<=2cm", …
mdata <- mdata %>% 
  select(title, contains("date"), geo_accession, contains(":ch1"))

colnames(mdata)
 [1] "title"                              "submission_date"                   
 [3] "last_update_date"                   "geo_accession"                     
 [5] "age (years):ch1"                    "ajcc stage (7th edition, 2010):ch1"
 [7] "body mass index:ch1"                "er:ch1"                            
 [9] "gender:ch1"                         "her2:ch1"                          
[11] "histology group:ch1"                "histology:ch1"                     
[13] "menopausal status:ch1"              "metastases:ch1"                    
[15] "positive nodes:ch1"                 "pr:ch1"                            
[17] "race:ch1"                           "set:ch1"                           
[19] "tissue:ch1"                         "tnbc subtype:ch1"                  
[21] "triple-negative status:ch1"         "tumor grade:ch1"                   
[23] "tumor size:ch1"                    
  
cnames <- colnames(mdata)
cnames_processed <- str_split(cnames, pattern = ":") %>% 
  map_chr(~{.x[[1]]}) %>% 
  str_replace_all(" ", "_") %>% 
  str_replace_all("-", "_") %>% 
  str_remove_all("\\(|\\)|,")

cnames_processed
 [1] "title"                       "submission_date"             "last_update_date"           
 [4] "geo_accession"               "age_years"                   "ajcc_stage_7th_edition_2010"
 [7] "body_mass_index"             "er"                          "gender"                     
[10] "her2"                        "histology_group"             "histology"                  
[13] "menopausal_status"           "metastases"                  "positive_nodes"             
[16] "pr"                          "race"                        "set"                        
[19] "tissue"                      "tnbc_subtype"                "triple_negative_status"     
[22] "tumor_grade"                 "tumor_size"                 
colnames(mdata) <- cnames_processed
rm(cnames, cnames_processed)
glimpse(mdata)
Rows: 265
Columns: 23
$ title                       <chr> "S1-H10", "S1-H14", "S1-H19", "S1-H20B", "S1-H22", "S1-H27", "S1-H28", "…
$ submission_date             <chr> "Dec 17 2015", "Dec 17 2015", "Dec 17 2015", "Dec 17 2015", "Dec 17 2015…
$ last_update_date            <chr> "Dec 18 2015", "Dec 18 2015", "Dec 18 2015", "Dec 18 2015", "Dec 18 2015…
$ geo_accession               <chr> "GSM1974566", "GSM1974567", "GSM1974568", "GSM1974569", "GSM1974570", "G…
$ age_years                   <dbl> NA, 41, 55, 55, 65, 40, 66, 57, NA, 55, 44, 78, 54, 71, 37, 51, 64, NA, …
$ ajcc_stage_7th_edition_2010 <chr> "T2N1M0", "T1N0M0", "T2N0M0", "T1cN0M0", "T2N2MX", "T1NXMX", "T1cNXMX", …
$ body_mass_index             <dbl> 32, 29, NA, 31, 38, 22, 22, 38, 29, 32, 44, 28, 34, 32, 23, 23, 36, 22, …
$ er                          <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Negative", …
$ gender                      <chr> "Female", "Female", "Female", "Female", "Female", "Female", "Female", "F…
$ her2                        <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Negative", …
$ histology_group             <chr> "Infiltrating Ductal Carcinoma", "Infiltrating Ductal Carcinoma", "Infil…
$ histology                   <chr> "Infiltrating Ductal Carcinoma", "Infiltrating Ductal Carcinoma", "Infil…
$ menopausal_status           <chr> "Post-Menopausal", "Post-Menopausal", NA, "Post-Menopausal", "Post-Menop…
$ metastases                  <chr> "No mets", "No mets", "No mets", "No mets", NA, NA, NA, NA, "No mets", N…
$ positive_nodes              <chr> "1 - 3", "0", "0", "0", "4-9", NA, NA, "0", "0", "4-9", "0", "1 - 3", "1…
$ pr                          <chr> "Negative", "Negative", "Negative", "Negative", "Negative", "Negative", …
$ race                        <chr> "Caucasian", "Caucasian", "Caucasian", "Caucasian", "Caucasian", "Caucas…
$ set                         <chr> "Validation TN", "Validation TN", "Validation TN", "Validation TN", "Val…
$ tissue                      <chr> "Breast cancer", "Breast cancer", "Breast cancer", "Breast cancer", "Bre…
$ tnbc_subtype                <chr> "Mesenchymal (MES)", "Basal-Like Immune-Activated (BLIA)", "Mesenchymal …
$ triple_negative_status      <chr> "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", "TN", …
$ tumor_grade                 <chr> NA, "Poorly Differentiated", "Poorly Differentiated", "Poorly Differenti…
$ tumor_size                  <chr> "2 - 5 cm", "<=2cm", "2 - 5 cm", "<=2cm", "2 - 5 cm", "<=2cm", "<=2cm", …
# mdata %>% 
  # distinct(triple_negative_status, tnbc_subtype)
  # distinct(submission_date, triple_negative_status)
  # distinct(tnbc_subtype, triple_negative_status)
  # distinct(histology, histology_group)
  # distinct(er, her2, pr, triple_negative_status)

Looking at the number of samples for each combination of set and each condition.

mdata %>% 
  count(triple_negative_status, set)

Reading in raw probe intensity data

Celfiles downloaded from GEO and kept the folder celfiles/

celFiles <- list.celfiles('celfiles/', full.names = TRUE, listGzipped = TRUE)
celFiles %>% head()
[1] "celfiles//GSM1974566_S1_H10.CEL.gz"  "celfiles//GSM1974567_S1_H14.CEL.gz" 
[3] "celfiles//GSM1974568_S1_H19.CEL.gz"  "celfiles//GSM1974569_S1_H20B.CEL.gz"
[5] "celfiles//GSM1974570_S1_H22.CEL.gz"  "celfiles//GSM1974571_S1_H27.CEL.gz" 
names(celFiles) <- celFiles %>% 
  basename() %>% 
  str_split("\\.") %>% 
  map_chr(~{.x[1]}) %>% 
  str_split("_") %>% 
  map_chr(~{.x[1]}) 

head(celFiles)
                           GSM1974566                            GSM1974567 
 "celfiles//GSM1974566_S1_H10.CEL.gz"  "celfiles//GSM1974567_S1_H14.CEL.gz" 
                           GSM1974568                            GSM1974569 
 "celfiles//GSM1974568_S1_H19.CEL.gz" "celfiles//GSM1974569_S1_H20B.CEL.gz" 
                           GSM1974570                            GSM1974571 
 "celfiles//GSM1974570_S1_H22.CEL.gz"  "celfiles//GSM1974571_S1_H27.CEL.gz" 
head(mdata)
mdata <- mdata[match(mdata$geo_accession, names(celFiles)), ]

Getting only the relevant variables from the metadata.

mdata_subset <- mdata %>%
  select(geo_accession, 
         title, 
         triple_negative_status, 
         tnbc_subtype,
         submission_date,
         er,
         her2,
         pr,
         race,
         set,
         gender, 
         age_years) %>% 
  mutate(across(where(is.character), .fns = factor)) %>% 
  mutate(tnbc_subtype = if_else(is.na(as.character(tnbc_subtype)), "Not Applicable", as.character(tnbc_subtype))) %>% 
  mutate(tnbc_subtype = factor(tnbc_subtype)) %>% 
  as.data.frame()


rownames(mdata_subset) <- as.character(mdata_subset$geo_accession)

head(mdata_subset)
NA
# rawData <- read.celfiles(celFiles, phenoData = AnnotatedDataFrame(mdata_subset))
# saveRDS(object = rawData, "rawData.RDS")
rawData <- readRDS("rawData.RDS")

Looking at the dimensions of the raw expression matrix.

exprs(rawData) %>% dim()
[1] 1354896     265

Using regular rma on data (without separating by class)

res_1 <- rma(rawData)
Loading required package: RSQLite
Loading required package: DBI
Background correcting
Normalizing
Calculating Expression
# saveRDS(object = res_1, "res_1.RDS")
res_1 <- readRDS("res_1.RDS")
exprs(res_1) %>% 
  dim()
[1] 54675   265
exprs(res_1)[1:5, 1:5]
          GSM1974566 GSM1974567 GSM1974568 GSM1974569 GSM1974570
1007_s_at  10.754163  11.361803   9.690693  10.157010  10.597699
1053_at     8.625663   9.011796   7.854072   7.925281   8.456781
117_at      7.333973   7.385199   7.466878   8.048563   7.581895
121_at      9.077887   8.782080   8.885189   8.775218   8.752407
1255_g_at   4.656331   4.635625   4.536114   4.626950   5.454820

Performing class-specific RMA by reading in the expression sets separately

Getting lists of the TNBC samples and the non-TNBC samples.

tnbc_samples <- mdata_subset %>% 
  filter(triple_negative_status == "TN") %>% 
  select(geo_accession) %>% 
  unlist(use.names = F) %>% 
  as.character()

head(tnbc_samples)
[1] "GSM1974566" "GSM1974567" "GSM1974568" "GSM1974569" "GSM1974570" "GSM1974571"
nontnbc_samples <- mdata_subset %>% 
  filter(triple_negative_status == "not TN") %>% 
  select(geo_accession) %>% 
  unlist(use.names = F) %>% 
  as.character()

head(nontnbc_samples)
[1] "GSM1978883" "GSM1978884" "GSM1978885" "GSM1978886" "GSM1978887" "GSM1978888"

Creating different metadata tables for TNBC and nonTNBC.

mdata_subset_tnbc <- mdata_subset[tnbc_samples, ]
dim(mdata_subset_tnbc)
[1] 198  12
mdata_subset_nontnbc <- mdata_subset[nontnbc_samples, ]
dim(mdata_subset_nontnbc)
[1] 67 12

Reading in the TNBC files.

# rawData_tnbc <- read.celfiles(filenames = celFiles[tnbc_samples], 
#                               phenoData = AnnotatedDataFrame(mdata_subset_tnbc))
# 
# rawData_tnbc
# saveRDS(rawData_tnbc, file = "rawData_tnbc.RDS")
rawData_tnbc <- readRDS(file = "rawData_tnbc.RDS")
rawData_tnbc
ExpressionFeatureSet (storageMode: lockedEnvironment)
assayData: 1354896 features, 198 samples 
  element names: exprs 
protocolData
  rowNames: GSM1974566 GSM1974567 ... GSM1974763 (198 total)
  varLabels: exprs dates
  varMetadata: labelDescription channel
phenoData
  rowNames: GSM1974566 GSM1974567 ... GSM1974763 (198 total)
  varLabels: geo_accession title ... age_years (11 total)
  varMetadata: labelDescription channel
featureData: none
experimentData: use 'experimentData(object)'
Annotation: pd.hg.u133.plus.2 
Loading required package: pd.hg.u133.plus.2
Loading required package: RSQLite
Loading required package: DBI

Reading in the nonTNBC files.

# rawData_nontnbc <- read.celfiles(filenames = celFiles[nontnbc_samples], 
#                               phenoData = AnnotatedDataFrame(mdata_subset_nontnbc))
# 
# rawData_nontnbc
# saveRDS(rawData_nontnbc, file = "rawData_nontnbc.RDS")
rawData_nontnbc <- readRDS(file = "rawData_nontnbc.RDS")
rawData_nontnbc
ExpressionFeatureSet (storageMode: lockedEnvironment)
assayData: 1354896 features, 67 samples 
  element names: exprs 
protocolData
  rowNames: GSM1978883 GSM1978884 ... GSM1978949 (67 total)
  varLabels: exprs dates
  varMetadata: labelDescription channel
phenoData
  rowNames: GSM1978883 GSM1978884 ... GSM1978949 (67 total)
  varLabels: geo_accession title ... age_years (11 total)
  varMetadata: labelDescription channel
featureData: none
experimentData: use 'experimentData(object)'
Annotation: pd.hg.u133.plus.2 

Performing RMA on TNBC data.

# res_tnbc <- rma(rawData_tnbc)
# saveRDS(res_tnbc, file = "res_tnbc.RDS")
res_tnbc <- readRDS(file = "res_tnbc.RDS")

Performing RMA on nonTNBC data.

# res_nontnbc <- rma(rawData_nontnbc)
# saveRDS(res_nontnbc, file = "res_nontnbc.RDS")
res_nontnbc <- readRDS(file = "res_nontnbc.RDS")

Combining the expression matrices of TNBC and nonTNBC data after separate RMA.

res_joint <- cbind(exprs(res_tnbc), exprs(res_nontnbc))
res_joint[1:5, 1:5]
          GSM1974566 GSM1974567 GSM1974568 GSM1974569 GSM1974570
1007_s_at  10.703231  11.345325   9.732361  10.175704  10.576463
1053_at     8.605269   9.019553   7.794234   7.945029   8.497373
117_at      7.360884   7.373951   7.481344   8.047586   7.530427
121_at      9.100729   8.776369   8.860402   8.755606   8.766560
1255_g_at   4.664492   4.628692   4.569530   4.627230   5.467292

Saving certain CSV files for everyone else to refer to

Saving the joint expression matrix from class-specific QN.

res_joint %>% 
  as_tibble(rownames = "probe_id") %>% 
  write_csv("dataframe_files/post_classQN_expression.csv")
Error: Cannot open file for writing:
* 'dataframe_files/post_classQN_expression.csv'

Saving a subset of the metadata that I think is relevant.

Saving the TNBC and nonTNBC metadata separately, just in case.

mdata_subset_nontnbc %>% 
  write_csv("dataframe_files/metadata_subset_nontnbc.csv")

Getting sample-specific boxplots

Sample specific boxplots for regular RMA QN

res_1_df_long <- res_1 %>%
  exprs() %>% 
  as_tibble(rownames = "probeID") %>% 
  pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id", 
               values_to = "intensity") %>% 
  left_join(., mdata_subset, by = c("sample_id" = "geo_accession"))
# saveRDS(object = res_1_df_long, "res_1_df_long.RDS")
res_1_df_long <- readRDS("res_1_df_long.RDS")
p2 <- res_1_df_long %>% 
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity, 
                             color = set)) +
  labs(x = "samples", 
       title = str_wrap("Sample-wise log2 intensity boxplots for whole QN")) +
  scale_color_npg() +
  theme(axis.text.x = element_blank())

p2

NA
ggsave("plots/exploration_plots/GSE76275_post_regQN_boxplots.png", 
       p2, 
       units = "cm", width = 30, height = 10)
rm(res_1_df_long)

Sample specific boxplots for classQN

res_joint_df_long <- res_joint %>% 
  as_tibble(rownames = "probeID") %>% 
  pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id", 
               values_to = "intensity")
  
# saveRDS(object = res_joint_df_long, "res_joint_df_long.RDS")
res_joint_df_long <- readRDS("res_joint_df_long.RDS")
p1 <- res_joint_df_long %>% 
  left_join(., mdata_subset, by = c("sample_id" = "geo_accession")) %>% 
  mutate(sample_id = factor(sample_id)) %>% 
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity, 
                             color = set)) +
  labs(x = "samples", 
       title = str_wrap("Sample-wise log2 intensity boxplots for  classQN", 60)) +
  scale_color_npg() +
  theme(axis.text.x = element_blank())

p1 

ggsave("plots/exploration_plots/GSE76275_post_classQN_boxplots.png", 
       p1, 
       units = "cm", width = 30, height = 10)

Performing PCA

Custom functions

Function to create an annotated data frame by combining PC scores as well as metadata: useful for ggplot visualization.

get_pca_annot_df <- function(pca.obj, sample_id_col, mdata_df){
  ind_scores <- pca.obj$x
  ind_scores_reordered <- ind_scores[match(rownames(ind_scores), mdata_df[[sample_id_col]]), ] %>% 
    as_tibble(rownames = sample_id_col) %>% 
    mutate(filename = factor(!!sym(sample_id_col)))
  ind_scores_annot <- left_join(ind_scores_reordered, y = mdata_df, by = sample_id_col) %>% 
  select(all_of(colnames(mdata_subset)), contains("PC"))
  return(ind_scores_annot)
}

Performing PCA on regular RMA data

pca.res_1 <- res_1 %>% 
  exprs() %>% 
  t() %>% 
  prcomp(center = TRUE, scale = TRUE)

Getting the annotated data frame for the PCA.

pca.res_1.annot_df <- get_pca_annot_df(pca.obj = pca.res_1, sample_id_col = "geo_accession", mdata_df= mdata_subset)
head(pca.res_1.annot_df)

Visualizing PCA results

Looking at the variance explained by the first 10 PCs.

fviz_eig(pca.res_1) +
  labs(x = "Principal Component", 
       title = str_wrap("Scree plot for the first 10 principal components for regular RMA-normalized data", 60)) +
  theme(title = element_text(size = 15))

Superimposing variables in data upon sample PCA scores. The PCA does not seem to separate the TNBC and nonTNBC samples that well when regular RMA is performed.

ggplot(pca.res_1.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = triple_negative_status)) +
    ggtitle("Samples in first two PCs, \ncoloured by triple_negative_status for whole QN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))


ggsave("plots/exploration_plots/PCA_wholeQN_TNBC_status.png")
Saving 7.29 x 4.51 in image

NA

The samples do not seem to separate well by set either.

ggplot(pca.res_1.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = set)) +
    ggtitle("Samples in first two PCs, \ncoloured by set (discovery or validation) for whole QN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))

ggsave("plots/exploration_plots/PCA_wholeQN_set.png")
Saving 7.29 x 4.51 in image

There does not seem to be too strong of a batch effect according to submission date.

ggplot(pca.res_1.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = submission_date)) +
    ggtitle("Samples in first two PCs, \ncoloured by submission date for whole QN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))

ggsave("plots/exploration_plots/PCA_wholeQN_set.png")
Saving 7.29 x 4.51 in image

ggplot(pca.res_1.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = tnbc_subtype)) +
    ggtitle("Samples in first two PCs, \ncoloured by tnbc_subtype for whole QN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5), legend.position = "right", legend.direction = "vertical", legend.key.width = unit(x = 0.5, units = "cm")) 

Performing PCA on separately-performed RMA data

pca.res_joint <- res_joint %>% 
  t() %>% 
  prcomp(center = TRUE, scale = TRUE)
saveRDS(pca.res_joint, "pca_res_joint.RDS")
# pca.res_joint <- readRDS("pca_res_joint.RDS")

Getting the annotated data frame for the PCA.

pca.res_joint.annot_df <- get_pca_annot_df(pca.obj = pca.res_joint, sample_id_col = "geo_accession", mdata_df= mdata_subset)
head(pca.res_joint.annot_df)

Visualizing PCA results

Looking at the variance explained by the first 10 PCs.

fviz_eig(pca.res_joint) +
  labs(x = "Principal Component", 
       title = str_wrap("Scree plot for the first 10 principal components for classQN-normalized data", 60)) +
  theme(title = element_text(size = 15))

Superimposing variables in data upon sample PCA scores. The PCA does separate the TNBC and nonTNBC samples well when class-specific RMA is performed.

ggplot(pca.res_joint.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = triple_negative_status)) +
    ggtitle("Samples in first two PCs, \ncoloured by triple_negative_status for classQN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))

NA

The validation nonTNBC samples are separated from the discovery TNBC and validation TNBC samples.

ggplot(pca.res_joint.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = set)) +
    ggtitle("Samples in first two PCs, \ncoloured by set (discovery or validation) for classQN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))

Submission date is perfectly confounded with TNBC status. May or may not be batch effects.

ggplot(pca.res_joint.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = submission_date)) +
    ggtitle("Samples in first two PCs, \ncoloured by submission date) for classQN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))

ggplot(pca.res_joint.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = tnbc_subtype)) +
    ggtitle("Samples in first two PCs, \ncoloured by tnbc_subtype for classQN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5), legend.position = "right", legend.direction = "vertical", legend.key.width = unit(x = 0.5, units = "cm")) 

Performing hierarchical clustering

Getting distances

perform_min_max <- function(x){
  mm_transformation <- preProcess(x, method = "range")
  rescaled <- predict(mm_transformation, x)
  return(rescaled)
}

Getting distances after performing min max normalization.

res_1_dists <- exprs(res_1) %>% 
  t() %>% 
  perform_min_max() %>% 
  dist(method = "euclidean")
  
# saveRDS(res_1_dists, "res_1_dists.RDS")
res_1_dists <- readRDS("res_1_dists.RDS")
res_joint_dists <- res_joint %>% 
    t() %>% 
  perform_min_max() %>% 
  dist(method = "euclidean")
  
# saveRDS(res_joint_dists, "res_joint_dists.RDS")
res_joint_dists <- readRDS("res_joint_dists.RDS")

Using dendrograms

res_1_dend <- res_1_dists %>% 
  hclust() %>% 
  as.dendrogram()
res_joint_dend <- res_joint_dists %>%
  hclust() %>% 
  as.dendrogram()
library(dendextend)

---------------------
Welcome to dendextend version 1.15.2
Type citation('dendextend') for how to cite the package.

Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/

Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
You may ask questions at stackoverflow, use the r and dendextend tags: 
     https://stackoverflow.com/questions/tagged/dendextend

    To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
---------------------


Attaching package: ‘dendextend’

The following object is masked from ‘package:Biostrings’:

    nnodes

The following object is masked from ‘package:stats’:

    cutree
# res_1_dend %>% 
#   labels()
# res_1_dend %>% 
#   order.dendrogram()
# (res_1 %>% 
#   exprs() %>% 
#   colnames())[28]
  
res_1_dend_laborder <- res_1_dend %>% 
  labels()
mycolors <- ifelse(mdata_subset[res_1_dend_laborder, ]$triple_negative_status == "TN", "forestgreen", "maroon")
par(mar = c(10,2,1,1))
res_1_dend %>% 
  set("labels_cex", 0.1) %>% 
  plot()

colored_bars(colors = mycolors, dend = res_1_dend, rowLabels = "TN Status", add = TRUE)

res_joint_dend_laborder <- res_joint_dend %>% 
  labels()
mycolors <- ifelse(mdata_subset[res_joint_dend_laborder, ]$triple_negative_status == "TN", "forestgreen", "maroon")
par(mar = c(10,2,1,1))
res_joint_dend %>% 
  set("labels_cex", 0.1) %>% 
  plot()

colored_bars(colors = mycolors, dend = res_joint_dend, rowLabels = "TN Status", add = TRUE)

Using heatmaps

Function to process distance object into a distance matrix for heatmap visualization.

get_distmat <- function(x){
  distmat <- as.matrix(x)
  colnames(distmat) <- NULL
  diag(distmat) <- NA
  return(distmat)
}
row_annot <- mdata_subset %>% 
  select(set, submission_date, tnbc_subtype)

head(row_annot)
set.seed(1)
row_colours <- list( "set" = c("steelblue", "maroon", "gold"), 
                     "submission_date" = sample(colorRampPalette(colors = brewer.pal(n = 8, name = "Set2"))(8), 2),
                     "tnbc_subtype" = sample(colorRampPalette(colors = brewer.pal(n = 8, name = "Dark2"))(8), 5)
                     )

names(row_colours$set) <- as.character(unique(mdata_subset$set))
names(row_colours$submission_date) <- as.character(unique(mdata_subset$submission_date))
names(row_colours$tnbc_subtype) <- as.character(unique(mdata_subset$tnbc_subtype))
str(row_colours)
List of 3
 $ set            : Named chr [1:3] "steelblue" "maroon" "gold"
  ..- attr(*, "names")= chr [1:3] "Validation TN" "Discovery TN" "Validation not TN"
 $ submission_date: Named chr [1:2] "#66C2A5" "#E78AC3"
  ..- attr(*, "names")= chr [1:2] "Dec 17 2015" "Dec 22 2015"
 $ tnbc_subtype   : Named chr [1:5] "#A6761D" "#1B9E77" "#D95F02" "#66A61E" ...
  ..- attr(*, "names")= chr [1:5] "Mesenchymal (MES)" "Basal-Like Immune-Activated (BLIA)" "Basal-Like Immune-Suppressed (BLIS)" "Luminal-AR (LAR)" ...
res_1_dists %>% 
  get_distmat() %>% 
pheatmap(.,
         annotation_row = row_annot, 
         annotation_colors = row_colours,
         show_colnames = F,
         show_rownames = F,
         cutree_rows = 3,
         main = str_wrap("Heatmap of sample distances for whole QN expression matrix", 60),
         legend_labels = c("small distance", "large distance"),
         legend_breaks = c(min(., na.rm = TRUE), 
                         max(., na.rm = TRUE)))

res_joint_dists %>% 
  get_distmat() %>% 
pheatmap(.,
         annotation_row = row_annot, 
         annotation_colors = row_colours,
         show_colnames = F,
         show_rownames = F,
         cutree_rows = 2,
         cutree_cols = 2,
         main = str_wrap("Heatmap of sample distances for class-specific QN expression matrix", 60),
         legend_labels = c("small distance", "large distance"),
         legend_breaks = c(min(., na.rm = TRUE), 
                         max(., na.rm = TRUE)))

Performing SVA

Performing SVA on regular RMA data

full_mod <- mdata_subset %>% 
  select(geo_accession, triple_negative_status) %>% 
  arrange(triple_negative_status) %>% 
  model.matrix(~triple_negative_status, data = .)

head(full_mod)
           (Intercept) triple_negative_statusTN
GSM1978883           1                        0
GSM1978884           1                        0
GSM1978885           1                        0
GSM1978886           1                        0
GSM1978887           1                        0
GSM1978888           1                        0
red_mod <- model.matrix(~1, data = mdata_subset)

head(red_mod)
           (Intercept)
GSM1974566           1
GSM1974567           1
GSM1974568           1
GSM1974569           1
GSM1974570           1
GSM1974571           1

Get number of significant surrogate variables.

n.sv.wholeQN <- num.sv(exprs(res_1), full_mod, method="leek")
n.sv.wholeQN
[1] 0
svobj.wholeQN <- sva(exprs(res_1), mod = full_mod, mod0 = red_mod, n.sv = 1)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
left_join(sv_df.wholeQN, mdata, by = "geo_accession") %>% 
  mutate(index = 5) %>% 
  ggplot() +
  # geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
  geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
  theme_light() +
  labs(y = "Surrogate Variable Value", title = "Distribution of latent variable estimated by SVA for different grouping factors")

  

# ggsave("plots/exploration_plots/sva_grouping_normalRMA.png")

Performing SVA on class-specific quantile normalized data

Create full model matrix.

full_mod <- mdata_subset %>% 
  select(geo_accession, triple_negative_status) %>% 
  arrange(triple_negative_status) %>% 
  model.matrix(~triple_negative_status, data = .)

head(full_mod)
           (Intercept) triple_negative_statusTN
GSM1978883           1                        0
GSM1978884           1                        0
GSM1978885           1                        0
GSM1978886           1                        0
GSM1978887           1                        0
GSM1978888           1                        0

Create reduced model matrix.

red_mod <- model.matrix(~1, data = mdata_subset)

head(red_mod)
           (Intercept)
GSM1974566           1
GSM1974567           1
GSM1974568           1
GSM1974569           1
GSM1974570           1
GSM1974571           1

Get number of significant surrogate variables.

n.sv.classQN <- num.sv(res_joint, full_mod, method="leek")
n.sv.classQN
[1] 1

Perform SVA on classQN-normalized expression matrix.

svobj.classQN <- sva(res_joint, mod = full_mod, mod0 = red_mod, n.sv = n.sv.classQN)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
sv_df.classQN <- tibble("geo_accession" = colnames(res_joint), "sv" = svobj.classQN$sv)

head(sv_df.classQN)
saveRDS(sv_df.classQN, "sv_df_classQN.RDS")
# sv_df.classQN <- readRDS("sv_df_classQN.RDS")
left_join(sv_df.classQN , mdata, by = "geo_accession") %>% 
  mutate(index = 5) %>% 
  ggplot() +
  # geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
  geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
  theme_light() +
  labs(y = "Surrogate Variable Value", title = "Distribution of latent variable estimated by SVA for different grouping factors")
  

ggsave("plots/exploration_plots/sva_grouping_classQN.png")
Saving 7.29 x 4.51 in image

Trying to see if the SVA estimates a batch when QN is not applied

In this attempt, I perform no quantile normalization while performing RMA. If QN has not been performed and a surrogate variable shows up that corresponds to batch, batch effects are probably present.

rawData.summary <- rma(rawData, background = TRUE, normalize = FALSE)
Background correcting
Calculating Expression
rawData.summary_df_long <- rawData.summary %>% 
  exprs() %>% 
  as_tibble(rownames = "probeID") %>% 
  pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id", 
               values_to = "intensity") %>% 
  left_join(., mdata_subset, by = c("sample_id" = "geo_accession"))
p3 <- rawData.summary_df_long %>% 
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity, 
                             color = set)) +
  labs(x = "samples", 
       title = str_wrap("Sample-wise log2 intensity boxplots in the absence of QN", 60)) +
  scale_color_npg() +
  theme(axis.text.x = element_blank())

p3

ggsave("plots/exploration_plots/GSE76275_noQN_boxplots.png", 
       p1, 
       units = "cm", width = 30, height = 10)

Getting the number of surrogate variables in the absence of quantile normalization.

n.sv.nonorm <- num.sv(exprs(rawData.summary), full_mod, method="leek")

There is one surrogate variable present in the absence of QN.

n.sv.nonorm
[1] 1
svobj.nonorm <- sva(exprs(rawData.summary), mod = full_mod, mod0 = red_mod, n.sv = n.sv.nonorm)
Number of significant surrogate variables is:  1 
Iteration (out of 5 ):1  2  3  4  5  
sv_df.nonorm <- tibble("geo_accession" = colnames(exprs(rawData.summary)), "sv" = svobj.nonorm$sv)

head(sv_df.nonorm)
left_join(sv_df.nonorm, mdata, by = "geo_accession") %>% 
  mutate(index = 5) %>% 
  ggplot() +
  # geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
  geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
  theme_light() +
  labs(y = "Surrogate Variable Value", 
       title = str_wrap("Distribution of latent variable estimated by SVA for different grouping factors", 60))

ggsave("plots/exploration_plots/sva_grouping_noQN.png")
Saving 7.29 x 4.51 in image

---
title: "Explore GSE76275"
output: 
  html_notebook:
    toc: true
    toc_depth: 2
    toc_float: true
---

# Loading libraries

```{r}
library(GEOquery)
library(oligo)
# library(affy)
library(limma)
library(sva)
library(tidyverse)
library(ggsci)
library(factoextra)
library(pheatmap)
library(dendextend)
library(caret)
library(RColorBrewer)
# library(viridis)
```


# Custom functions

```{r}
# given a matrix, perform min-max scaling on its columns
min_max_mat <- function(mat){
  mat_rescaled <- apply(mat, 2, function(v){
    v_range <- range(v)
    names(v_range) <- c("minimum", "maximum")
    range_difference <- v_range["maximum"] - v_range["minimum"]
    rescaled <- (v - v_range["minimum"])/range_difference
    return(rescaled)
  })
  return(mat_rescaled)
}
```

# Getting data from GEOquery

```{r}
# geodata <- GEOquery::getGEO(GEO = "GSE76275", destdir = "./tempfiles")
# geodata <- GEOquery::getGEO(filename = "./tempfiles/GSE76275_series_matrix.txt.gz")
```


```{r}
# saveRDS(geodata, "geodata.RDS")
geodata <- readRDS("geodata.RDS")
```


```{r}
# mdata <- geodata %>% 
#   pluck(1) %>% 
#   phenoData() %>%
#   pData() %>% as_tibble()
```


```{r}
# feature_data <- geodata %>% 
#   pluck(1) %>% 
#   featureData()
  
```


```{r}
# write_csv(mdata, "raw_mdata.csv")
mdata <- read_csv("raw_mdata.csv")
```


```{r}
# saveRDS(feature_data, "featureData.RDS")
# feature_data <- readRDS("featureData.RDS")
```

# Inspecting and cleaning the metadata

```{r}
mdata %>% 
  glimpse()
```


```{r}
mdata <- mdata %>% 
  select(title, contains("date"), geo_accession, contains(":ch1"))

colnames(mdata)
  
```


```{r}
cnames <- colnames(mdata)
```


```{r}
cnames_processed <- str_split(cnames, pattern = ":") %>% 
  map_chr(~{.x[[1]]}) %>% 
  str_replace_all(" ", "_") %>% 
  str_replace_all("-", "_") %>% 
  str_remove_all("\\(|\\)|,")

cnames_processed
```


```{r}
colnames(mdata) <- cnames_processed
rm(cnames, cnames_processed)
```


```{r}
glimpse(mdata)
```


```{r}
# mdata %>% 
  # distinct(triple_negative_status, tnbc_subtype)
  # distinct(submission_date, triple_negative_status)
  # distinct(tnbc_subtype, triple_negative_status)
  # distinct(histology, histology_group)
  # distinct(er, her2, pr, triple_negative_status)
```

Looking at the number of samples for each combination of set and each condition.

```{r}
mdata %>% 
  count(triple_negative_status, set)
```


# Reading in raw probe intensity data

Celfiles downloaded from GEO and kept the folder celfiles/

```{r}
celFiles <- list.celfiles('celfiles/', full.names = TRUE, listGzipped = TRUE)
celFiles %>% head()
```



```{r}
names(celFiles) <- celFiles %>% 
  basename() %>% 
  str_split("\\.") %>% 
  map_chr(~{.x[1]}) %>% 
  str_split("_") %>% 
  map_chr(~{.x[1]}) 

head(celFiles)
```


```{r}
head(mdata)
```

```{r}
mdata <- mdata[match(mdata$geo_accession, names(celFiles)), ]
```


Getting only the relevant variables from the metadata.


```{r}
mdata_subset <- mdata %>%
  select(geo_accession, 
         title, 
         triple_negative_status, 
         tnbc_subtype,
         submission_date,
         er,
         her2,
         pr,
         race,
         set,
         gender, 
         age_years) %>% 
  mutate(across(where(is.character), .fns = factor)) %>% 
  mutate(tnbc_subtype = if_else(is.na(as.character(tnbc_subtype)), "Not Applicable", as.character(tnbc_subtype))) %>% 
  mutate(tnbc_subtype = factor(tnbc_subtype)) %>% 
  as.data.frame()


rownames(mdata_subset) <- as.character(mdata_subset$geo_accession)

head(mdata_subset)

```

```{r}
# rawData <- read.celfiles(celFiles, phenoData = AnnotatedDataFrame(mdata_subset))
```


```{r}
# saveRDS(object = rawData, "rawData.RDS")
rawData <- readRDS("rawData.RDS")
```

Looking at the dimensions of the raw expression matrix.

```{r}
exprs(rawData) %>% dim()
```


## Using regular rma on data (without separating by class)

```{r}
# res_1 <- rma(rawData)
```


```{r}
# saveRDS(object = res_1, "res_1.RDS")
res_1 <- readRDS("res_1.RDS")
```


```{r}
exprs(res_1) %>% 
  dim()
```


```{r}
exprs(res_1)[1:5, 1:5]
```


## Performing class-specific RMA by reading in the expression sets separately


Getting lists of the TNBC samples and the non-TNBC samples.

```{r}
tnbc_samples <- mdata_subset %>% 
  filter(triple_negative_status == "TN") %>% 
  select(geo_accession) %>% 
  unlist(use.names = F) %>% 
  as.character()

head(tnbc_samples)

nontnbc_samples <- mdata_subset %>% 
  filter(triple_negative_status == "not TN") %>% 
  select(geo_accession) %>% 
  unlist(use.names = F) %>% 
  as.character()

head(nontnbc_samples)
```


Creating different metadata tables for TNBC and nonTNBC.

```{r}
mdata_subset_tnbc <- mdata_subset[tnbc_samples, ]
dim(mdata_subset_tnbc)
mdata_subset_nontnbc <- mdata_subset[nontnbc_samples, ]
dim(mdata_subset_nontnbc)
```


Reading in the TNBC files.

```{r}
# rawData_tnbc <- read.celfiles(filenames = celFiles[tnbc_samples], 
#                               phenoData = AnnotatedDataFrame(mdata_subset_tnbc))
# 
# rawData_tnbc
```


```{r}
# saveRDS(rawData_tnbc, file = "rawData_tnbc.RDS")
rawData_tnbc <- readRDS(file = "rawData_tnbc.RDS")
```

```{r}
rawData_tnbc
```


Reading in the nonTNBC files.

```{r}
# rawData_nontnbc <- read.celfiles(filenames = celFiles[nontnbc_samples], 
#                               phenoData = AnnotatedDataFrame(mdata_subset_nontnbc))
# 
# rawData_nontnbc
```



```{r}
# saveRDS(rawData_nontnbc, file = "rawData_nontnbc.RDS")
rawData_nontnbc <- readRDS(file = "rawData_nontnbc.RDS")
```


```{r}
rawData_nontnbc
```


Performing RMA on TNBC data.

```{r}
# res_tnbc <- rma(rawData_tnbc)
```


```{r}
# saveRDS(res_tnbc, file = "res_tnbc.RDS")
res_tnbc <- readRDS(file = "res_tnbc.RDS")
```


Performing RMA on nonTNBC data.

```{r}
# res_nontnbc <- rma(rawData_nontnbc)
```


```{r}
# saveRDS(res_nontnbc, file = "res_nontnbc.RDS")
res_nontnbc <- readRDS(file = "res_nontnbc.RDS")
```


Combining the expression matrices of TNBC and nonTNBC data after separate RMA.

```{r}
res_joint <- cbind(exprs(res_tnbc), exprs(res_nontnbc))
```


```{r}
res_joint[1:5, 1:5]
```

# Saving certain CSV files for everyone else to refer to

Saving the joint expression matrix from class-specific QN.

```{r}
res_joint %>% 
  as_tibble(rownames = "probe_id") %>% 
  write_csv("dataframe_files/post_classQN_expression.csv")
```

Saving a subset of the metadata that I think is relevant.

```{r}
mdata_subset %>% 
  write_csv("dataframe_files/metadata_subset.csv")
```


Saving the TNBC and nonTNBC metadata separately, just in case.

```{r}
mdata_subset_tnbc %>% 
  write_csv("dataframe_files/metadata_subset_tnbc.csv")
```


```{r}
mdata_subset_nontnbc %>% 
  write_csv("dataframe_files/metadata_subset_nontnbc.csv")
```


# Getting sample-specific boxplots

## Sample specific boxplots for regular RMA QN


```{r}
res_1_df_long <- res_1 %>%
  exprs() %>% 
  as_tibble(rownames = "probeID") %>% 
  pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id", 
               values_to = "intensity") %>% 
  left_join(., mdata_subset, by = c("sample_id" = "geo_accession"))
```


```{r}
# saveRDS(object = res_1_df_long, "res_1_df_long.RDS")
res_1_df_long <- readRDS("res_1_df_long.RDS")
```


```{r}
p2 <- res_1_df_long %>% 
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity, 
                             color = set)) +
  labs(x = "samples", 
       title = str_wrap("Sample-wise log2 intensity boxplots for whole QN", 60)) +
  scale_color_npg() +
  theme(axis.text.x = element_blank())

p2
  
```


```{r}
ggsave("plots/exploration_plots/GSE76275_post_regQN_boxplots.png", 
       p2, 
       units = "cm", width = 30, height = 10)
```


```{r}
rm(res_1_df_long)
```


## Sample specific boxplots for classQN

```{r}
res_joint_df_long <- res_joint %>% 
  as_tibble(rownames = "probeID") %>% 
  pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id", 
               values_to = "intensity")
  
```


```{r}
# saveRDS(object = res_joint_df_long, "res_joint_df_long.RDS")
res_joint_df_long <- readRDS("res_joint_df_long.RDS")
```



```{r}
p1 <- res_joint_df_long %>% 
  left_join(., mdata_subset, by = c("sample_id" = "geo_accession")) %>% 
  mutate(sample_id = factor(sample_id)) %>% 
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity, 
                             color = set)) +
  labs(x = "samples", 
       title = str_wrap("Sample-wise log2 intensity boxplots for  classQN", 60)) +
  scale_color_npg() +
  theme(axis.text.x = element_blank())

p1 
```


```{r}
ggsave("plots/exploration_plots/GSE76275_post_classQN_boxplots.png", 
       p1, 
       units = "cm", width = 30, height = 10)
```


# Performing PCA

## Custom functions

Function to create an annotated data frame by combining PC scores as well as metadata: useful for ggplot visualization.

```{r}
get_pca_annot_df <- function(pca.obj, sample_id_col, mdata_df){
  ind_scores <- pca.obj$x
  ind_scores_reordered <- ind_scores[match(rownames(ind_scores), mdata_df[[sample_id_col]]), ] %>% 
    as_tibble(rownames = sample_id_col) %>% 
    mutate(filename = factor(!!sym(sample_id_col)))
  ind_scores_annot <- left_join(ind_scores_reordered, y = mdata_df, by = sample_id_col) %>% 
  select(all_of(colnames(mdata_subset)), contains("PC"))
  return(ind_scores_annot)
}
```


## Performing PCA on regular RMA data

```{r}
pca.res_1 <- res_1 %>% 
  exprs() %>% 
  t() %>% 
  prcomp(center = TRUE, scale = TRUE)
```


```{r}
# saveRDS(pca.res_1, "pca_res1.RDS")
pca.res_1 <- readRDS("pca_res1.RDS")
```


Getting the annotated data frame for the PCA.

```{r}
pca.res_1.annot_df <- get_pca_annot_df(pca.obj = pca.res_1, sample_id_col = "geo_accession", mdata_df= mdata_subset)
head(pca.res_1.annot_df)
```

### Visualizing PCA results

Looking at the variance explained by the first 10 PCs.

```{r}
fviz_eig(pca.res_1) +
  labs(x = "Principal Component", 
       title = str_wrap("Scree plot for the first 10 principal components for regular RMA-normalized data", 60)) +
  theme(title = element_text(size = 15))
```

Superimposing variables in data upon sample PCA scores.
The PCA does not seem to separate the TNBC and nonTNBC samples that well when regular RMA is performed.

```{r}
ggplot(pca.res_1.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = triple_negative_status)) +
    ggtitle("Samples in first two PCs, \ncoloured by triple_negative_status for whole QN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))


ggsave("plots/exploration_plots/PCA_wholeQN_TNBC_status.png")
  
```



The samples do not seem to separate well by set either.

```{r}
ggplot(pca.res_1.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = set)) +
    ggtitle("Samples in first two PCs, \ncoloured by set (discovery or validation) for whole QN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))

ggsave("plots/exploration_plots/PCA_wholeQN_set.png")
```

There does not seem to be too strong of a batch effect according to submission date.

```{r}
ggplot(pca.res_1.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = submission_date)) +
    ggtitle("Samples in first two PCs, \ncoloured by submission date for whole QN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))

ggsave("plots/exploration_plots/PCA_wholeQN_set.png")
```


```{r}
ggplot(pca.res_1.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = tnbc_subtype)) +
    ggtitle("Samples in first two PCs, \ncoloured by tnbc_subtype for whole QN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5), legend.position = "right", legend.direction = "vertical", legend.key.width = unit(x = 0.5, units = "cm")) 
```

## Performing PCA on separately-performed RMA data


```{r}
pca.res_joint <- res_joint %>% 
  t() %>% 
  prcomp(center = TRUE, scale = TRUE)
```


```{r}
# saveRDS(pca.res_joint, "pca_res_joint.RDS")
pca.res_joint <- readRDS("pca_res_joint.RDS")
```


Getting the annotated data frame for the PCA.

```{r}
pca.res_joint.annot_df <- get_pca_annot_df(pca.obj = pca.res_joint, sample_id_col = "geo_accession", mdata_df= mdata_subset)
head(pca.res_joint.annot_df)
```

### Visualizing PCA results

Looking at the variance explained by the first 10 PCs.

```{r}
fviz_eig(pca.res_joint) +
  labs(x = "Principal Component", 
       title = str_wrap("Scree plot for the first 10 principal components for classQN-normalized data", 60)) +
  theme(title = element_text(size = 15))
```

Superimposing variables in data upon sample PCA scores.
The PCA **does** separate the TNBC and nonTNBC samples well when class-specific RMA is performed.

```{r}
ggplot(pca.res_joint.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = triple_negative_status)) +
    ggtitle("Samples in first two PCs, \ncoloured by triple_negative_status for classQN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))
  
```
The validation nonTNBC samples are separated from the discovery TNBC and validation TNBC samples.

```{r}
ggplot(pca.res_joint.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = set)) +
    ggtitle("Samples in first two PCs, \ncoloured by set (discovery or validation) for classQN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))
```

Submission date is perfectly confounded with TNBC status. May or may not be batch effects.

```{r}
ggplot(pca.res_joint.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = submission_date)) +
    ggtitle("Samples in first two PCs, \ncoloured by submission date) for classQN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5))
```



```{r}
ggplot(pca.res_joint.annot_df) + 
  geom_point(mapping = aes(x = PC1, y = PC2, colour = tnbc_subtype)) +
    ggtitle("Samples in first two PCs, \ncoloured by tnbc_subtype for classQN") +
  guides(colour = guide_legend(override.aes = list(size= 4))) +
  theme(axis.text.x = element_text(angle = 90, size = 7),
               title = element_text(size = 10),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black", size = 0.5), legend.position = "right", legend.direction = "vertical", legend.key.width = unit(x = 0.5, units = "cm")) 
```



# Performing hierarchical clustering

## Getting distances

```{r}
perform_min_max <- function(x){
  mm_transformation <- preProcess(x, method = "range")
  rescaled <- predict(mm_transformation, x)
  return(rescaled)
}
```


Getting distances after performing min max normalization.


```{r}
# res_1_dists <- exprs(res_1) %>% 
#   t() %>% 
#   perform_min_max() %>% 
#   dist(method = "euclidean")
  
```


```{r}
# saveRDS(res_1_dists, "res_1_dists.RDS")
res_1_dists <- readRDS("res_1_dists.RDS")
```



```{r}
# res_joint_dists <- res_joint %>% 
#     t() %>% 
#   perform_min_max() %>% 
#   dist(method = "euclidean")
  
```


```{r}
# saveRDS(res_joint_dists, "res_joint_dists.RDS")
res_joint_dists <- readRDS("res_joint_dists.RDS")
```



## Using dendrograms

```{r}
res_1_dend <- res_1_dists %>% 
  hclust() %>% 
  as.dendrogram()
```


```{r}
res_joint_dend <- res_joint_dists %>%
  hclust() %>% 
  as.dendrogram()
```

 
```{r}
library(dendextend)
```


```{r}
# res_1_dend %>% 
#   labels()
```


```{r}
# res_1_dend %>% 
#   order.dendrogram()
```

```{r}
# (res_1 %>% 
#   exprs() %>% 
#   colnames())[28]
  
```


```{r}
res_1_dend_laborder <- res_1_dend %>% 
  labels()

```


```{r}
mycolors <- ifelse(mdata_subset[res_1_dend_laborder, ]$triple_negative_status == "TN", "forestgreen", "maroon")
```



```{r}
par(mar = c(10,2,1,1))
res_1_dend %>% 
  set("labels_cex", 0.1) %>% 
  plot()

colored_bars(colors = mycolors, dend = res_1_dend, rowLabels = "TN Status", add = TRUE)
```


```{r}
res_joint_dend_laborder <- res_joint_dend %>% 
  labels()
```


```{r}
mycolors <- ifelse(mdata_subset[res_joint_dend_laborder, ]$triple_negative_status == "TN", "forestgreen", "maroon")
```



```{r}
par(mar = c(10,2,1,1))
res_joint_dend %>% 
  set("labels_cex", 0.1) %>% 
  plot()

colored_bars(colors = mycolors, dend = res_joint_dend, rowLabels = "TN Status", add = TRUE)
```


## Using heatmaps

Function to process distance object into a distance matrix for heatmap visualization.

```{r}
get_distmat <- function(x){
  distmat <- as.matrix(x)
  colnames(distmat) <- NULL
  diag(distmat) <- NA
  return(distmat)
}
```



```{r}
row_annot <- mdata_subset %>% 
  select(set, submission_date, tnbc_subtype)

head(row_annot)
```


```{r}
set.seed(1)
row_colours <- list( "set" = c("steelblue", "maroon", "gold"), 
                     "submission_date" = sample(colorRampPalette(colors = brewer.pal(n = 8, name = "Set2"))(8), 2),
                     "tnbc_subtype" = sample(colorRampPalette(colors = brewer.pal(n = 8, name = "Dark2"))(8), 5)
                     )

names(row_colours$set) <- as.character(unique(mdata_subset$set))
names(row_colours$submission_date) <- as.character(unique(mdata_subset$submission_date))
names(row_colours$tnbc_subtype) <- as.character(unique(mdata_subset$tnbc_subtype))
str(row_colours)
```


```{r}
res_1_dists %>% 
  get_distmat() %>% 
pheatmap(.,
         annotation_row = row_annot, 
         annotation_colors = row_colours,
         show_colnames = F,
         show_rownames = F,
         cutree_rows = 3,
         main = str_wrap("Heatmap of sample distances for whole QN expression matrix", 60),
         legend_labels = c("small distance", "large distance"),
         legend_breaks = c(min(., na.rm = TRUE), 
                         max(., na.rm = TRUE)))
```


```{r}
res_joint_dists %>% 
  get_distmat() %>% 
pheatmap(.,
         annotation_row = row_annot, 
         annotation_colors = row_colours,
         show_colnames = F,
         show_rownames = F,
         cutree_rows = 2,
         cutree_cols = 2,
         main = str_wrap("Heatmap of sample distances for class-specific QN expression matrix", 60),
         legend_labels = c("small distance", "large distance"),
         legend_breaks = c(min(., na.rm = TRUE), 
                         max(., na.rm = TRUE)))
```


# Performing SVA

## Performing SVA on regular RMA data


```{r}
full_mod <- mdata_subset %>% 
  select(geo_accession, triple_negative_status) %>% 
  arrange(triple_negative_status) %>% 
  model.matrix(~triple_negative_status, data = .)

head(full_mod)
```


```{r}
red_mod <- model.matrix(~1, data = mdata_subset)

head(red_mod)
```

Get number of significant surrogate variables.

```{r}
n.sv.wholeQN <- num.sv(exprs(res_1), full_mod, method="leek")
```


```{r}
n.sv.wholeQN
```


```{r}
svobj.wholeQN <- sva(exprs(res_1), mod = full_mod, mod0 = red_mod, n.sv = 1)
```


```{r}
sv_df.wholeQN <- tibble("geo_accession" = colnames(exprs(res_1)), "sv" = svobj.wholeQN$sv)

head(sv_df.wholeQN)
```



```{r}
left_join(sv_df.wholeQN, mdata, by = "geo_accession") %>% 
  mutate(index = 5) %>% 
  ggplot() +
  # geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
  geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
  theme_light() +
  labs(y = "Surrogate Variable Value", title = "Distribution of latent variable estimated by SVA for different grouping factors")
  

# ggsave("plots/exploration_plots/sva_grouping_normalRMA.png")
```


## Performing SVA on class-specific quantile normalized data

Create full model matrix.

```{r}
full_mod <- mdata_subset %>% 
  select(geo_accession, triple_negative_status) %>% 
  arrange(triple_negative_status) %>% 
  model.matrix(~triple_negative_status, data = .)

head(full_mod)
```

Create reduced model matrix.

```{r}
red_mod <- model.matrix(~1, data = mdata_subset)

head(red_mod)
```

Get number of significant surrogate variables.

```{r}
n.sv.classQN <- num.sv(res_joint, full_mod, method="leek")
```


```{r}
n.sv.classQN
```


Perform SVA on classQN-normalized expression matrix.


```{r}
svobj.classQN <- sva(res_joint, mod = full_mod, mod0 = red_mod, n.sv = n.sv.classQN)
```


```{r}
sv_df.classQN <- tibble("geo_accession" = colnames(res_joint), "sv" = svobj.classQN$sv)

head(sv_df.classQN)
```

```{r}
saveRDS(sv_df.classQN, "sv_df_classQN.RDS")
# sv_df.classQN <- readRDS("sv_df_classQN.RDS")
```


```{r}
left_join(sv_df.classQN , mdata, by = "geo_accession") %>% 
  mutate(index = 5) %>% 
  ggplot() +
  # geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
  geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
  theme_light() +
  labs(y = "Surrogate Variable Value", title = "Distribution of latent variable estimated by SVA for different grouping factors")
  

ggsave("plots/exploration_plots/sva_grouping_classQN.png")
```


# Trying to see if the SVA estimates a batch when QN is not applied

In this attempt, I perform no quantile normalization while performing RMA. If QN has not been performed and a surrogate variable shows up that corresponds to batch, batch effects are probably present.

```{r}
rawData.summary <- rma(rawData, background = TRUE, normalize = FALSE)
```


```{r}
rawData.summary_df_long <- rawData.summary %>% 
  exprs() %>% 
  as_tibble(rownames = "probeID") %>% 
  pivot_longer(cols = all_of(c(tnbc_samples, nontnbc_samples)), names_to = "sample_id", 
               values_to = "intensity") %>% 
  left_join(., mdata_subset, by = c("sample_id" = "geo_accession"))
```



```{r}
p3 <- rawData.summary_df_long %>% 
  ggplot() +
  geom_boxplot(mapping = aes(x = reorder(sample_id, as.numeric(set)), y = intensity, 
                             color = set)) +
  labs(x = "samples", 
       title = str_wrap("Sample-wise log2 intensity boxplots in the absence of QN", 60)) +
  scale_color_npg() +
  theme(axis.text.x = element_blank())

p3
```


```{r}
ggsave("plots/exploration_plots/GSE76275_noQN_boxplots.png", 
       p1, 
       units = "cm", width = 30, height = 10)
```

Getting the number of surrogate variables in the absence of quantile normalization.

```{r}
n.sv.nonorm <- num.sv(exprs(rawData.summary), full_mod, method="leek")
```

There is one surrogate variable present in the absence of QN.

```{r}
n.sv.nonorm
```


```{r}
svobj.nonorm <- sva(exprs(rawData.summary), mod = full_mod, mod0 = red_mod, n.sv = n.sv.nonorm)
```


```{r}
sv_df.nonorm <- tibble("geo_accession" = colnames(exprs(rawData.summary)), "sv" = svobj.nonorm$sv)

head(sv_df.nonorm)
```


```{r}
left_join(sv_df.nonorm, mdata, by = "geo_accession") %>% 
  mutate(index = 5) %>% 
  ggplot() +
  # geom_col(mapping = aes(y = fct_reorder(geo_accession, sv, .fun = function(x){x}), x = sv, fill = set)) +
  geom_boxplot(mapping = aes(x = submission_date, y = sv, fill = set)) +
  theme_light() +
  labs(y = "Surrogate Variable Value", 
       title = str_wrap("Distribution of latent variable estimated by SVA for different grouping factors", 60))

ggsave("plots/exploration_plots/sva_grouping_noQN.png")
```


